Breaking down EDA for a set of 4 datasets¶
import numpy as np
import pandas as pd
from IPython.display import display, display_html , HTML
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import learning_curve, cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
characteristics = pd.read_csv('../input/2019-database-of-road-traffic-injuries/caracteristiques-2019.csv')
characteristics.name = 'characteristics'
places = pd.read_csv('../input/2019-database-of-road-traffic-injuries/lieux-2019.csv')
places.name = 'places'
drivers = pd.read_csv('../input/2019-database-of-road-traffic-injuries/usagers-2019.csv')
drivers.name = 'drivers'
vehicles = pd.read_csv('../input/2019-database-of-road-traffic-injuries/vehicules-2019.csv')
vehicles.name = 'vehicles'
datasets = [characteristics,places,vehicles,drivers]
characteristics = characteristics.set_index('Num_Acc')
places = places.set_index('Num_Acc')
vehicles = vehicles.set_index('id_vehicule')
drivers = drivers.set_index('id_vehicule')
pd.set_option('display.max_row',max(characteristics.shape[0],places.shape[0],drivers.shape[0],vehicles.shape[0]))
pd.set_option('display.max_column',max(characteristics.shape[1],places.shape[1],drivers.shape[1],vehicles.shape[1]))
for df in datasets:
print ("The dataset",df.name,"has",df.shape[0],"rows and",df.shape[1],"columns")
The dataset characteristics has 58840 rows and 15 columns The dataset places has 58840 rows and 18 columns The dataset vehicles has 100710 rows and 11 columns The dataset drivers has 132977 rows and 15 columns
display(HTML('<h1>characteristics</h1>'))
display(characteristics.head())
display(HTML('<h1>vehicles</h1>'))
display(vehicles.head())
display(HTML('<h1>drivers</h1>'))
display(drivers.head())
display(HTML('<h1>places</h1>'))
display(places.head())
characteristics
| jour | mois | an | hrmn | lum | dep | com | agg | int | atm | col | adr | lat | long | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Num_Acc | ||||||||||||||
| 201900000001 | 30.0 | 11.0 | 2019.0 | 0.062500 | 4.0 | 93 | 93053 | 1.0 | 1.0 | 1.0 | 2.0 | AUTOROUTE A3 | 488962100.0 | 24701200.0 |
| 201900000002 | 30.0 | 11.0 | 2019.0 | 0.118056 | 3.0 | 93 | 93066 | 1.0 | 1.0 | 1.0 | 6.0 | AUTOROUTE A1 | 489307000.0 | 23688000.0 |
| 201900000003 | 28.0 | 11.0 | 2019.0 | 0.635417 | 1.0 | 92 | 92036 | 1.0 | 1.0 | 1.0 | 4.0 | AUTOROUTE A86 | 489358718.0 | 23191744.0 |
| 201900000004 | 30.0 | 11.0 | 2019.0 | 0.847222 | 5.0 | 94 | 94069 | 1.0 | 1.0 | 1.0 | 4.0 | A4 | 488173295.0 | 24281502.0 |
| 201900000005 | 30.0 | 11.0 | 2019.0 | 0.166667 | 3.0 | 94 | 94028 | 1.0 | 1.0 | 1.0 | 2.0 | A86 INT | 487763620.0 | 24332540.0 |
vehicles
| Num_Acc | num_veh | senc | catv | obs | obsm | choc | manv | motor | occutc | |
|---|---|---|---|---|---|---|---|---|---|---|
| id_vehicule | ||||||||||
| 138Â 306Â 524 | 201900000001 | B01 | 2 | 7 | 0 | 2 | 5 | 23 | 1 | NaN |
| 138Â 306Â 525 | 201900000001 | A01 | 2 | 17 | 1 | 0 | 3 | 11 | 1 | NaN |
| 138Â 306Â 523 | 201900000002 | A01 | 1 | 7 | 4 | 0 | 1 | 0 | 1 | NaN |
| 138Â 306Â 520 | 201900000003 | A01 | 1 | 7 | 0 | 2 | 1 | 2 | 1 | NaN |
| 138Â 306Â 521 | 201900000003 | B01 | 1 | 7 | 1 | 0 | 4 | 2 | 1 | NaN |
drivers
| Num_Acc | num_veh | place | catu | grav | sexe | an_nais | trajet | secu1 | secu2 | secu3 | locp | actp | etatp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id_vehicule | ||||||||||||||
| 138Â 306Â 524 | 201900000001 | B01 | 2 | 2 | 4 | 2 | 2002 | 0 | 1 | 0 | -1 | -1 | -1 | -1 |
| 138Â 306Â 524 | 201900000001 | B01 | 1 | 1 | 4 | 2 | 1993 | 5 | 1 | 0 | -1 | -1 | -1 | -1 |
| 138Â 306Â 525 | 201900000001 | A01 | 1 | 1 | 1 | 1 | 1959 | 0 | 1 | 0 | -1 | -1 | -1 | -1 |
| 138Â 306Â 523 | 201900000002 | A01 | 1 | 1 | 4 | 2 | 1994 | 0 | 1 | 0 | -1 | -1 | -1 | -1 |
| 138Â 306Â 520 | 201900000003 | A01 | 1 | 1 | 1 | 1 | 1996 | 0 | 1 | 0 | -1 | -1 | 0 | -1 |
places
| catr | voie | v1 | v2 | circ | nbv | vosp | prof | pr | pr1 | plan | lartpc | larrout | surf | infra | situ | vma | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Num_Acc | |||||||||||||||||
| 201900000001 | 1.0 | 3 | 0.0 | NaN | 3.0 | 10.0 | 0.0 | 1.0 | 6.0 | 900.0 | 2.0 | NaN | NaN | 1.0 | 2.0 | 1.0 | 70.0 |
| 201900000002 | 1.0 | 1 | 0.0 | NaN | 1.0 | 2.0 | 0.0 | 4.0 | 3.0 | 845.0 | 2.0 | NaN | NaN | 1.0 | 0.0 | 1.0 | 70.0 |
| 201900000003 | 1.0 | 86 | 0.0 | NaN | 3.0 | 8.0 | 0.0 | 1.0 | 10.0 | 500.0 | 3.0 | NaN | NaN | 1.0 | 0.0 | 1.0 | 90.0 |
| 201900000004 | 1.0 | 4 | 0.0 | NaN | 3.0 | 5.0 | 0.0 | 1.0 | 2.0 | 299.0 | 1.0 | NaN | NaN | 1.0 | 0.0 | 1.0 | 90.0 |
| 201900000005 | 1.0 | 86 | 0.0 | INT | 1.0 | 3.0 | 0.0 | 1.0 | 41.0 | 0.0 | 3.0 | NaN | NaN | 1.0 | 2.0 | 1.0 | 90.0 |
Expand this to read the features description
¶
Num_Acc¶
Identification number of the accident.
jour¶
Day of the accident.
mois¶
Month of the accident.
an¶
Year of accident.
hrmn¶
Hour and minutes of the accident. This one is tricky, it correspond to a percentage of 24h (your turn to convert it into day time)
lum¶
Light: lighting conditions in which the accident occurred:
- Full day
- Twilight or dawn
- Night without public lighting
- Night with public lighting not on
- Night with public lighting on
dep¶
Department: Code INSEE (National Institute of Statistics and Economic Studies) of department (2A Corse-du-Sud. 2B Haute-Corse).
com¶
Municipality: The municipality number is a code given by INSEE. The code is made up of the code INSEE of the department followed by 3 digits.
agg¶
Location :
- Outside agglomeration
- In built-up areas
int¶
Intersection:
- Excluding intersection
- Intersection in X
- T-intersection
- Y intersection
- Intersection with more than 4 branches
- Roundabout
- Place
- Level crossing
- Other intersection
atm¶
Atmospheric conditions:
-1. Not specified
- Normal
- Light rain
- Heavy rain
- Snow. hail
- Fog. smoke
- Strong wind. storm
- Dazzling weather
- Cloudy weather
- Other
col¶
Collision type:
-1. Not specified
- Two vehicles. frontal
- Two vehicles. from the rear
- Two vehicles. from the side
- Three vehicles and more. in a chain
- Three or more vehicles. multiple collisions
- Other collision 7. No collision
adr¶
Postal address: variable entered for accidents occurring in built-up areas.
lat¶
Latitude
Long¶
Longitude
catr¶
Road category:
- Highway
- National road
- Departmental road
- Communal roads
- Outside the public network
- Parking lot open to public traffic
- Urban metropolis roads
- other
voie¶
Route number.
V1¶
Numerical index of the road number (example: 2 bis, 3 ter etc.).
V2¶
Alphanumeric road index letter.
circ¶
Traffic regime:
-1. Not specified
- One way
- Bidirectional
- A separate carriageway
- With variable assignment channels
nbv¶
Total number of traffic lanes.
vosp¶
Indicates the existence of a reserved lane, regardless of whether or
not the accident took place on this way.
-1. Not specified 0. Not applicable
- Cycle path
- Cycle lane
- Reserved lane
prof¶
Longitudinal profile describes the gradient of the road at the location of the accident:
-1. Not specified
- Flat
- Slope
- hilltop
- Bottom of coast
pr¶
Number of the associated PR (number of the upstream terminal). The value -1 means that the PR is not informed.
pr1¶
Distance in meters from the PR (in relation to the upstream terminal). The value -1 means that the PR is not informed.
plan¶
Plan layout:
-1. Not specified
- rectilinear part
- In a curve to the left
- In a curve to the right 4. In "S"
lartpc¶
Width of the central reservation (TPC) if it exists (in m).
larrout¶
Width of the roadway used for vehicular traffic is not included in the stopping strips emergency, TPC and parking spaces (in m).
surf¶
Surface condition:
-1. Not specified
- Normal
- Wet
- Puddles
- Flooded
- Snowy
- Mud
- Icy
- Fat. oil
- Other
infra¶
Development. Infrastructure:
-1. Not specified 0. None
- Underground. tunnel
- Bridge. flyover
- Exchanger or connection sling
- Railroad
- Crossroads
- Pedestrian zone
- Toll zone
- Site
- Others
situ¶
Situation of the accident:
-1. Not specified 0. None
- On the road
- On emergency lane
- On the shoulder
- On the sidewalk
- On a cycle path
- On other special track
- Others
vma¶
Maximum authorized speed at the scene and at the time of the accident.
vehicle_id¶
Unique identifier of the vehicle used for each user occupying this vehicle (including pedestrians who are attached to the vehicles which collided with them). Numerical code.
Num_Veh¶
Identifier of the vehicle taken back for each of the users occupying this vehicle (including pedestrians who are attached to the vehicles which collided with them). Alphanumeric code.
senc¶
Flow direction :
-1. Not specified 0. Unknown
- PK or PR or increasing postal address number
- PK or PR or decreasing postal address number
- Lack of reference
catv¶
Vehicle category:
- Not determinable
- Bicycle
- Moped <50cm3
- Cart (Quadricycle with bodywork motor) (formerly "cart or motor tricycle")
- Reference not used since 2006 (registered scooter)
- Reference unused since 2006 (motorcycle)
- Reference unused since 2006 (sidecar)
- VL only
- Reference unused since 2006 (VL + caravan)
- Reference not used since 2006 (light vehicles + trailer)
- VU only 1.5T <= PTAC <= 3.5T with or without trailer (formerly VU only 1.5T <= PTAC <= 3.5T)
- Reference not used since 2006 (VU (10) + caravan)
- Reference not used since 2006 (VU (10) + trailer)
- PL only 3.5T <PTCA <= 7,5T
- PL only > 7.5T
- PL> 3,5T + trailer
- Road tractor only
- Road tractor + semi-trailer
- Reference not used since 2006 (public transport)
- Reference not used since 2006 (tram)
- Special gear
- Farm tractor
- Scooter <50 cm3
- Motorcycle> 50 cm3 and <= 125 cm3
- Scooter> 50 cm3 and <= 125 cm3
- Motorcycle> 125 cm3
- Scooter> 125 cm3
- Light quad <= 50 cm3 (Quadricycle without bodywork engine)
- Heavy quad> 50 cm3 (Quadricycle without bodywork engine)
- Bus
- Coach
- Train
- Tram
- 3WD <= 50 cm3
- 3WD> 50 cm3 <= 125 cm3
- 3WD> 125 cm3
- EDP with motor
- EDP without motor
- VAE
- Other vehicle
obs¶
Fixed obstacle struck:
-1. Not specified 0. Not applicable
- Parked vehicle
- Tree
- Metal slide
- Concrete slide
- Other slide
- Building, wall, bridge pier
- Vertical signage support or emergency call station
- Post
- Street furniture
- Parapet
- Island, refuge, upper terminal
- Sidewalk edge
- Ditch, embankment, rock face
- Other fixed obstacle on the road
- Other fixed obstacle on sidewalk or shoulder
- Clearance of the roadway without obstacle
- Nozzle. aqueduct head
obsm¶
Movable obstacle struck:
-1. Not specified 0. None
- Pedestrian
- Vehicle
- Rail vehicle
- Domestic animal
- Wild animal
- Other
choc¶
Initial shock point: -1. Not specified 0. None
- Before
- Right front
- Front left
- Rear
- Right back
- Left rear
- Right side
- Left side
- Multiple shocks (rolls)
manv¶
Main maneuver before the accident:
-1. Not specified 0. Unknown
- Without change of direction
- Same direction, same row
- Between 2 lines
- In reverse
- In the wrong way
- Crossing the central reservation
- In the bus lane, in the same direction
- In the bus lane, in the opposite direction
- By inserting
- By making a U-turn on the road Changing lane
- Left
- Right Deported
- Left
- Right Turning
- Left
- Right Exceeding
- Left
- Right Various
- Crossing the road
- Parking maneuver
- Avoidance maneuver
- Door opening
- Stopped (except parking)
- Parked (with occupants
- Traveling on sidewalk
- Other maneuvers
motor¶
Vehicle engine type:
-1. Not specified 0. Unknown
- Hydrocarbons
- Electric hybrid
- Electric
- Hydrogen
- Human
- Other
occutc¶
Number of occupants in public transport.
id_vehicule¶
Unique identifier of the vehicle used for each user occupying this vehicle (including pedestrians who are attached to the vehicles which collided with them). Numerical code.
Num_Veh¶
Identifier of the vehicle taken back for each of the users occupying this vehicle (including pedestrians who are attached to the vehicles which collided with them). Alphanumeric code.
place¶
Used to locate the space occupied in the vehicle by the user at the time of the accident Check on this link for the pattern : https://ibb.co/NsTxbXP
catu¶
User category:
- Driver
- Passenger
- Pedestrian
grav¶
Severity of user injury, injured users are classified into three categories of
victims plus unharmed:
- Unharmed
- Killed
- Injured hospitalized
- Slightly injured
sexe¶
Driver gender:
- Male
- Female
An_nais¶
Year of birth of the driver
trajet¶
Reason for travel at the time of the accident:
-1. Not specified 0. Not specified
- Home. work
- Home. school
- Shopping. shopping
- Professional use
- Walk. leisure
- Other
Security equipment until 2018 was divided into 2 variables: existence and use. From 2019, this concerns use with up to 3 possible devices for the same user (especially for motorcyclists whose helmets and gloves are compulsory).
secu1¶
The character intelligence indicates the presence and use of safety equipment:
-1. Not specified 0. No equipment
- Belt 12
- Helmet
- Children's device
- reflective vest
- Airbag (2WD / 3WD)
- Gloves (2WD / 3WD)
- Gloves + Airbag (2WD / 3WD)
- Not determinable
- Other
secu2¶
The character intelligence indicates the presence and use of safety equipment:
-1. Not specified 0. No equipment
- Belt
- Helmet
- Children's device
- reflective vest
- Airbag (2WD / 3WD)
- Gloves (2WD / 3WD)
- Gloves + Airbag (2WD / 3WD)
- Not determinable
- Other
secu3¶
The character intelligence indicates the presence and use of safety equipment:
-1. Not specified 0. No equipment
- Belt
- Helmet
- Children's device
- reflective vest
- Airbag (2WD / 3WD)
- Gloves (2WD / 3WD)
- Gloves + Airbag (2WD / 3WD)
- Not determinable
- Other
locp¶
Localisation du piéton :
-1. Non renseigné 0. Sans objet Sur chaussée :
- A + 50 m du passage piéton
- A. 50 m du passage piéton Sur passage piéton :
- Sans signalisation lumineuse
- Avec signalisation lumineuse Divers :
- Sur trottoir
- Sur accotement
- Sur refuge ou BAU
- Sur contre allée
- Inconnue
actp¶
Pedestrian action:
-1. Not specified Moving 0. Not specified or not applicable
- Direction of colliding vehicle
- Opposite direction of the vehicle Various
- Crossing
- Masked
- Playing. running
- With animal
- Other A. Get on / off the vehicle B. Unknown
etatp¶
This variable is used to specify whether the injured pedestrian was alone or not:
-1. Not specified
- Alone
- Accompanied
- In a group
display(HTML('<h1><center>Missing values of the different tables (%)</center></h1>'))
a = pd.DataFrame(np.transpose(np.array((characteristics.columns,round(characteristics.isna().sum()/characteristics.shape[0]*100,2)),dtype=object,)),columns=['features','missing_rate'])
b = pd.DataFrame(np.transpose(np.array((vehicles.columns,round(vehicles.isna().sum()/vehicles.shape[0]*100,2)),dtype=object,)),columns=['features','missing_rate'])
c = pd.DataFrame(np.transpose(np.array((places.columns,round(places.isna().sum()/places.shape[0]*100,2)),dtype=object,)),columns=['features','missing_rate'])
d = pd.DataFrame(np.transpose(np.array((drivers.columns,round(drivers.isna().sum()/drivers.shape[0]*100,2)),dtype=object,)),columns=['features','missing_rate'])
def highlight_greaterthan(x):
if x.missing_rate > 80:
return ['background-color: #FFCECE']*2
if x.missing_rate > 40:
return ['background-color: #FFE9CE']*2
if x.missing_rate > 5:
return ['background-color: #FFFECE']*2
else:
return ['background-color: #CEFFFC']*2
a = a.style.apply(highlight_greaterthan, axis=1).set_table_styles([{
'selector': 'caption',
'props': [
('color', '#585858'),
('font-size', '30px')
]
}])
b = b.style.apply(highlight_greaterthan, axis=1).set_table_styles([{
'selector': 'caption',
'props': [
('color', '#585858'),
('font-size', '30px')
]
}])
c = c.style.apply(highlight_greaterthan, axis=1).set_table_styles([{
'selector': 'caption',
'props': [
('color', '#585858'),
('font-size', '30px')
]
}])
d = d.style.apply(highlight_greaterthan, axis=1).set_table_styles([{
'selector': 'caption',
'props': [
('color', '#585858'),
('font-size', '30px')
]
}])
a_styler = a.set_table_attributes("style='display:inline'").set_caption('characteristics')
b_styler = b.set_table_attributes("style='display:inline'").set_caption('vehicles')
c_styler = c.set_table_attributes("style='display:inline'").set_caption('places')
d_styler = d.set_table_attributes("style='display:inline'").set_caption('drivers')
space = "\xa0" * 50
display_html(a_styler._repr_html_() + space + b_styler._repr_html_() + space + c_styler._repr_html_() + space + d_styler._repr_html_(), raw=True)
display(HTML('<h3><i>The values highlighted are the ones above a certain threshold of missing values</i></h3>'))
display(HTML('<h3><i>We will get rid of those for the rest of the notebook</i></h3>'))
Missing values of the different tables (%)
| features | missing_rate | |
|---|---|---|
| 0 | jour | 0.220000 |
| 1 | mois | 0.220000 |
| 2 | an | 0.220000 |
| 3 | hrmn | 0.220000 |
| 4 | lum | 0.220000 |
| 5 | dep | 0.220000 |
| 6 | com | 0.220000 |
| 7 | agg | 0.220000 |
| 8 | int | 0.220000 |
| 9 | atm | 0.220000 |
| 10 | col | 0.220000 |
| 11 | adr | 0.940000 |
| 12 | lat | 0.220000 |
| 13 | long | 0.220000 |
| features | missing_rate | |
|---|---|---|
| 0 | Num_Acc | 0.000000 |
| 1 | num_veh | 0.000000 |
| 2 | senc | 0.000000 |
| 3 | catv | 0.000000 |
| 4 | obs | 0.000000 |
| 5 | obsm | 0.000000 |
| 6 | choc | 0.000000 |
| 7 | manv | 0.000000 |
| 8 | motor | 0.000000 |
| 9 | occutc | 99.110000 |
| features | missing_rate | |
|---|---|---|
| 0 | catr | 0.040000 |
| 1 | voie | 5.030000 |
| 2 | v1 | 18.310000 |
| 3 | v2 | 92.920000 |
| 4 | circ | 5.390000 |
| 5 | nbv | 1.150000 |
| 6 | vosp | 1.180000 |
| 7 | prof | 0.070000 |
| 8 | pr | 12.450000 |
| 9 | pr1 | 12.960000 |
| 10 | plan | 0.060000 |
| 11 | lartpc | 99.640000 |
| 12 | larrout | 99.370000 |
| 13 | surf | 0.080000 |
| 14 | infra | 0.120000 |
| 15 | situ | 0.240000 |
| 16 | vma | 1.540000 |
| features | missing_rate | |
|---|---|---|
| 0 | Num_Acc | 0.000000 |
| 1 | num_veh | 0.000000 |
| 2 | place | 0.000000 |
| 3 | catu | 0.000000 |
| 4 | grav | 0.000000 |
| 5 | sexe | 0.000000 |
| 6 | an_nais | 0.000000 |
| 7 | trajet | 0.000000 |
| 8 | secu1 | 0.000000 |
| 9 | secu2 | 0.000000 |
| 10 | secu3 | 0.000000 |
| 11 | locp | 0.000000 |
| 12 | actp | 0.000000 |
| 13 | etatp | 0.000000 |
The values highlighted are the ones above a certain threshold of missing values
We will get rid of those for the rest of the notebook
Visualizing datasets dtypes 1 by 1
characteristics_dtypes = pd.DataFrame(np.transpose(np.array((characteristics.columns,characteristics.dtypes),dtype=object,)),columns=['features','dtype'])
vehicles_dtypes = pd.DataFrame(np.transpose(np.array((vehicles.columns,vehicles.dtypes),dtype=object,)),columns=['features','dtype'])
places_dtypes = pd.DataFrame(np.transpose(np.array((places.columns,places.dtypes),dtype=object,)),columns=['features','dtype'])
drivers_dtypes = pd.DataFrame(np.transpose(np.array((drivers.columns,drivers.dtypes),dtype=object,)),columns=['features','dtype'])
characteristics_dtypes = characteristics_dtypes.style.set_table_styles([{
'selector': 'caption',
'props': [
('color', '#585858'),
('font-size', '30px')
]
}])
vehicles_dtypes = vehicles_dtypes.style.set_table_styles([{
'selector': 'caption',
'props': [
('color', '#585858'),
('font-size', '30px')
]
}])
places_dtypes = places_dtypes.style.set_table_styles([{
'selector': 'caption',
'props': [
('color', '#585858'),
('font-size', '30px')
]
}])
drivers_dtypes = drivers_dtypes.style.set_table_styles([{
'selector': 'caption',
'props': [
('color', '#585858'),
('font-size', '30px')
]
}])
characteristics_dtypes_styler = characteristics_dtypes.set_table_attributes("style='display:inline'").set_caption('characteristics')
vehicles_dtypes_styler = vehicles_dtypes.set_table_attributes("style='display:inline'").set_caption('vehicles')
places_dtypes_styler = places_dtypes.set_table_attributes("style='display:inline'").set_caption('places')
drivers_dtypes_styler = drivers_dtypes.set_table_attributes("style='display:inline'").set_caption('drivers')
space = "\xa0" * 50
display_html(characteristics_dtypes_styler._repr_html_() + space + vehicles_dtypes_styler._repr_html_() + space +
places_dtypes_styler._repr_html_() + space + drivers_dtypes_styler._repr_html_(), raw=True)
| features | dtype | |
|---|---|---|
| 0 | jour | float64 |
| 1 | mois | float64 |
| 2 | an | float64 |
| 3 | hrmn | float64 |
| 4 | lum | float64 |
| 5 | dep | object |
| 6 | com | object |
| 7 | agg | float64 |
| 8 | int | float64 |
| 9 | atm | float64 |
| 10 | col | float64 |
| 11 | adr | object |
| 12 | lat | float64 |
| 13 | long | float64 |
| features | dtype | |
|---|---|---|
| 0 | Num_Acc | int64 |
| 1 | num_veh | object |
| 2 | senc | int64 |
| 3 | catv | int64 |
| 4 | obs | int64 |
| 5 | obsm | int64 |
| 6 | choc | int64 |
| 7 | manv | int64 |
| 8 | motor | int64 |
| 9 | occutc | float64 |
| features | dtype | |
|---|---|---|
| 0 | catr | float64 |
| 1 | voie | object |
| 2 | v1 | float64 |
| 3 | v2 | object |
| 4 | circ | float64 |
| 5 | nbv | float64 |
| 6 | vosp | float64 |
| 7 | prof | float64 |
| 8 | pr | float64 |
| 9 | pr1 | float64 |
| 10 | plan | float64 |
| 11 | lartpc | float64 |
| 12 | larrout | float64 |
| 13 | surf | float64 |
| 14 | infra | float64 |
| 15 | situ | float64 |
| 16 | vma | float64 |
| features | dtype | |
|---|---|---|
| 0 | Num_Acc | int64 |
| 1 | num_veh | object |
| 2 | place | int64 |
| 3 | catu | int64 |
| 4 | grav | int64 |
| 5 | sexe | int64 |
| 6 | an_nais | int64 |
| 7 | trajet | int64 |
| 8 | secu1 | int64 |
| 9 | secu2 | int64 |
| 10 | secu3 | int64 |
| 11 | locp | int64 |
| 12 | actp | object |
| 13 | etatp | int64 |
characteristics
for col in characteristics.select_dtypes("object"):
print('\n')
print('Number of values in "',col,'"', {characteristics[col].nunique()})
print(characteristics[col].unique())
print('\n')
print('------------------------------------------------')
Number of values in " dep " {107}
['93' '92' '94' '87' '69' '38' '34' '13' '988' '976' '974' '972' '2B' '91'
'86' '83' '80' '78' '77' '76' '72' '71' '67' '66' '64' '60' '51' '50'
'49' '45' '37' '35' '33' '31' '30' '29' '22' '19' '18' '17' '74' '81' '2'
'59' '95' '63' '62' '973' '2A' '84' '9' '73' '43' '10' '36' '16' '7' '21'
'40' '24' '4' '85' '27' '28' '52' '68' '42' '82' '11' '987' '44' '61'
'14' '56' '58' '54' '47' '41' nan '3' '75' '1' '57' '32' '39' '15' '23'
'6' '5' '26' '48' '986' '971' '89' '25' '12' '88' '65' '53' '70' '46'
'90' '8' '79' '977' '55' '978' '975']
------------------------------------------------
Number of values in " com " {11421}
['93053' '93066' '92036' ... '67473' '85099' '76462']
------------------------------------------------
Number of values in " adr " {31934}
['AUTOROUTE A3' 'AUTOROUTE A1' 'AUTOROUTE A86' ... 'Route de Castelnau'
'Route de Nieul-le-Dolent' "Boulevard l'Alouette"]
------------------------------------------------
sns.set(font_scale = 1.5)
plt.figure(figsize=(10, 30))
plt.title('Number of accidents in 2019 per Department')
sns.countplot(y=characteristics['dep'])
plt.xlabel("Number of accidents")
plt.ylabel("Department")
plt.show()
sns.set(font_scale = 1.5)
fig, ax = plt.subplots(3,4, figsize=(30, 15))
i=0
for col in characteristics.select_dtypes(include=['float64','int64']):
sns.distplot(characteristics[col],label=col,ax=ax[i//4][i%4])
i=i+1
fig.show()
Comments
- We can notice on the first graph (accidents by department) that the number 75 skyrockets compared to the others
- We can notice on the second graph that the feature "an" that corresponds to "year" has no variance (in fact the whole dataset is based only on the year 2019 so thats pretty much obvious). We will get rid of this feature in the future.
- The two features "lat" and "long" corresponding to the Latitude and the Longitude of the accident are not scaled. There is a factor of 1e7 (to be changed)
vehicles
for col in vehicles.select_dtypes("object"):
print('\n')
print('Number of values in "',col,'"', {vehicles[col].nunique()})
print(vehicles[col].unique())
print('\n')
print('------------------------------------------------')
Number of values in " num_veh " {48}
['B01' 'A01' 'C01' 'Z01' 'D01' 'E01' 'AB01' 'Y01' 'I01' 'T01' 'O01' 'G01'
'F01' 'PB01' 'FB01' 'M01' 'LB01' 'H01' 'J01' 'K01' 'L01' 'CB01' 'X01'
'N01' 'W01' 'U01' 'V01' 'MB01' 'RA01' 'TC01' 'R01' 'Q01' 'GB01' 'MA01'
'VB01' 'RC01' 'BA01' 'TB01' '[01' '\\01' 'VF01' 'ZZ01' 'P01' 'DA01'
'AA01' 'BB01' 'ZB01' 'BC01']
------------------------------------------------
sns.set(font_scale = 1.5)
plt.figure(figsize=(10, 15))
plt.title('Number of accidents in 2019 per number of occupants (by category)')
sns.countplot(y=vehicles['num_veh'])
plt.xlabel("Number of accidents")
plt.ylabel("Category")
plt.show()
sns.set(font_scale = 1.5)
fig, ax = plt.subplots(3,3, figsize=(30, 10))
i=0
for col in vehicles.select_dtypes(include=['float64','int64']):
sns.distplot(vehicles[col],label=col,ax=ax[i//3][i%3])
i=i+1
fig.show()
Comments
- We can notice that the feature "obs" has a low variance
places
for col in places.select_dtypes("object"):
print('\n')
print('Number of values in "',col,'"', {places[col].nunique()})
print(places[col].unique())
print('\n')
print('------------------------------------------------')
Number of values in " voie " {14327}
['3' '1' '86' ... 'PRESIDENT ROOSEVELT (RUE DU) N°2 A 26' 'VC du Rocher'
'Gabriel Péri (AV)']
------------------------------------------------
Number of values in " v2 " {35}
[nan 'INT' 'B' 'D' ' -' 'A' 'N' 'R' 'E' 'F' 'C' 'EXT' 'Z' 'I' 'X' 'W' 'Y'
'b' 'G' 'V' 'U' 'L' 'M' 'H' 'T' 'S' 'P' 'O' '15' 'K' 'II' '1A' 'IV' ' D'
'A1' 'CD']
------------------------------------------------
sns.set(font_scale = 1.5)
fig, ax = plt.subplots(3,5, figsize=(30, 15))
i=0
for col in places.select_dtypes(include=['float64','int64']):
sns.distplot(places[col],label=col,ax=ax[i//5][i%5])
i=i+1
fig.show()
Comments
- We can notice that the feature "V1","vosp","pr" have a low variance
drivers
for col in drivers.select_dtypes("object"):
print('\n')
print('Number of values in "',col,'"', {drivers[col].nunique()})
print(drivers[col].unique())
print('\n')
print('------------------------------------------------')
Number of values in " num_veh " {29}
['B01' 'A01' 'C01' 'D01' 'E01' 'Z01' 'Y01' 'I01' 'T01' 'O01' 'G01' 'F01'
'M01' 'LB01' 'H01' 'J01' 'K01' 'L01' 'N01' 'W01' 'X01' 'U01' 'V01' 'Q01'
'MA01' 'CB01' '\\01' 'VF01' 'P01']
------------------------------------------------
Number of values in " actp " {13}
['-1' '0' '3' '2' '1' 'B' '4' '9' '5' 'A' '8' '6' '7']
------------------------------------------------
sns.set(font_scale = 1.5)
plt.figure(figsize=(10, 5))
plt.title('Number of accidents in 2019 sorted by pedestrian accions')
sns.countplot(y=drivers['actp'])
plt.xlabel("Number of accidents")
plt.ylabel("Category")
plt.show()
sns.set(font_scale = 1.5)
fig, ax = plt.subplots(3,4, figsize=(30, 15))
i=0
for col in drivers.select_dtypes(include=['float64','int64']):
sns.distplot(drivers[col],label=col,ax=ax[i//4][i%4])
i=i+1
fig.show()
Comments
- We can notice that the feature "secu3" has a low variance
Summary¶
characteristics¶
- "dep" 75 skyrockets compared to the others
- "an" that corresponds to "year" has no variance (in fact the whole dataset is based only on the year 2019 so thats pretty much obvious). We will get rid of this feature in the future.
- The two features "lat" and "long" corresponding to the Latitude and the Longitude of the accident are not scaled. There is a factor of 1e7 (to be changed)
vehicles¶
- "obs" has a low variance
places¶
- "V1","vosp","pr" have a low variance
drivers¶
- "secu3" has a low variance
Cleaning data¶
characteristics = pd.read_csv('../input/2019-database-of-road-traffic-injuries/caracteristiques-2019.csv')
characteristics.name = 'characteristics'
places = pd.read_csv('../input/2019-database-of-road-traffic-injuries/lieux-2019.csv')
places.name = 'places'
drivers = pd.read_csv('../input/2019-database-of-road-traffic-injuries/usagers-2019.csv')
drivers.name = 'drivers'
vehicles = pd.read_csv('../input/2019-database-of-road-traffic-injuries/vehicules-2019.csv')
vehicles.name = 'vehicles'
datasets = [characteristics,places,vehicles,drivers]
# Indexing the tables
characteristics = characteristics.set_index('Num_Acc')
places = places.set_index('Num_Acc')
vehicles = vehicles.set_index('id_vehicule')
drivers = drivers.set_index('id_vehicule')
# Dealing with features with too many NaNs
vehicles = vehicles.drop('occutc',axis=1)
places = places.drop(['v2','lartpc','larrout'],axis=1)
# Dealing with features according to the EDA
characteristics = characteristics.drop(['an','adr','com'],axis=1)
characteristics['lat']=characteristics['lat']/10000000
characteristics['long']=characteristics['long']/10000000
characteristics = characteristics.drop('201900033874',axis=0)
#characteristics = characteristics[characteristics['dep']!='2B'] # comment / uncomment
#characteristics = characteristics[characteristics['dep']!='2A'] # comment / uncomment
#characteristics = characteristics[(characteristics['dep'].astype(float)<100)] # comment / uncomment
#places = places.loc[characteristics.index.values] # comment / uncomment
places = places.drop('201900033874',axis=0) # comment / uncomment
places = places.drop(['v1','vosp','pr','voie'],axis=1)
vehicles = vehicles.drop('obs',axis=1)
drivers = drivers.drop(['secu3'],axis=1)
pd.set_option('display.max_row',max(characteristics.shape[0],places.shape[0],drivers.shape[0],vehicles.shape[0]))
pd.set_option('display.max_column',max(characteristics.shape[1],places.shape[1],drivers.shape[1],vehicles.shape[1]))
for df in datasets:
print ("The dataset",df.name,"has",df.shape[0],"rows and",df.shape[1],"columns")
The dataset characteristics has 58840 rows and 15 columns The dataset places has 58840 rows and 18 columns The dataset vehicles has 100710 rows and 11 columns The dataset drivers has 132977 rows and 15 columns
from sklearn.preprocessing import LabelEncoder
def encoding(df):
label = LabelEncoder()
for c in df.select_dtypes("object"):
df[c]=df[c].astype("|S")
df[c]=label.fit_transform(df[c])
return df
def imputation(df):
df = df.fillna(df.median())
df = df.dropna()
return df
def preprocessing(df):
df = encoding(df)
df = imputation(df)
return df
characteristics = preprocessing(characteristics)
vehicles = preprocessing(vehicles)
places = preprocessing(places)
drivers = preprocessing(drivers)
display(HTML('<h1>characteristics</h1>'))
display(characteristics.head())
display(HTML('<h1>vehicles</h1>'))
display(vehicles.head())
display(HTML('<h1>drivers</h1>'))
display(drivers.head())
display(HTML('<h1>places</h1>'))
display(places.head())
characteristics
| jour | mois | hrmn | lum | dep | agg | int | atm | col | lat | long | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Num_Acc | |||||||||||
| 201900000001 | 30.0 | 11.0 | 0.062500 | 4.0 | 93 | 1.0 | 1.0 | 1.0 | 2.0 | 48.896210 | 2.470120 |
| 201900000002 | 30.0 | 11.0 | 0.118056 | 3.0 | 93 | 1.0 | 1.0 | 1.0 | 6.0 | 48.930700 | 2.368800 |
| 201900000003 | 28.0 | 11.0 | 0.635417 | 1.0 | 92 | 1.0 | 1.0 | 1.0 | 4.0 | 48.935872 | 2.319174 |
| 201900000004 | 30.0 | 11.0 | 0.847222 | 5.0 | 94 | 1.0 | 1.0 | 1.0 | 4.0 | 48.817329 | 2.428150 |
| 201900000005 | 30.0 | 11.0 | 0.166667 | 3.0 | 94 | 1.0 | 1.0 | 1.0 | 2.0 | 48.776362 | 2.433254 |
vehicles
| Num_Acc | num_veh | senc | catv | obsm | choc | manv | motor | |
|---|---|---|---|---|---|---|---|---|
| id_vehicule | ||||||||
| 138Â 306Â 524 | 201900000001 | 3 | 2 | 7 | 2 | 5 | 23 | 1 |
| 138Â 306Â 525 | 201900000001 | 0 | 2 | 17 | 0 | 3 | 11 | 1 |
| 138Â 306Â 523 | 201900000002 | 0 | 1 | 7 | 0 | 1 | 0 | 1 |
| 138Â 306Â 520 | 201900000003 | 0 | 1 | 7 | 2 | 1 | 2 | 1 |
| 138Â 306Â 521 | 201900000003 | 3 | 1 | 7 | 0 | 4 | 2 | 1 |
drivers
| Num_Acc | num_veh | place | catu | grav | sexe | an_nais | trajet | secu1 | secu2 | locp | actp | etatp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id_vehicule | |||||||||||||
| 138Â 306Â 524 | 201900000001 | 1 | 2 | 2 | 4 | 2 | 2002 | 0 | 1 | 0 | -1 | 0 | -1 |
| 138Â 306Â 524 | 201900000001 | 1 | 1 | 1 | 4 | 2 | 1993 | 5 | 1 | 0 | -1 | 0 | -1 |
| 138Â 306Â 525 | 201900000001 | 0 | 1 | 1 | 1 | 1 | 1959 | 0 | 1 | 0 | -1 | 0 | -1 |
| 138Â 306Â 523 | 201900000002 | 0 | 1 | 1 | 4 | 2 | 1994 | 0 | 1 | 0 | -1 | 0 | -1 |
| 138Â 306Â 520 | 201900000003 | 0 | 1 | 1 | 1 | 1 | 1996 | 0 | 1 | 0 | -1 | 1 | -1 |
places
| catr | circ | nbv | prof | pr1 | plan | surf | infra | situ | vma | |
|---|---|---|---|---|---|---|---|---|---|---|
| Num_Acc | ||||||||||
| 201900000001 | 1.0 | 3.0 | 10.0 | 1.0 | 900.0 | 2.0 | 1.0 | 2.0 | 1.0 | 70.0 |
| 201900000002 | 1.0 | 1.0 | 2.0 | 4.0 | 845.0 | 2.0 | 1.0 | 0.0 | 1.0 | 70.0 |
| 201900000003 | 1.0 | 3.0 | 8.0 | 1.0 | 500.0 | 3.0 | 1.0 | 0.0 | 1.0 | 90.0 |
| 201900000004 | 1.0 | 3.0 | 5.0 | 1.0 | 299.0 | 1.0 | 1.0 | 0.0 | 1.0 | 90.0 |
| 201900000005 | 1.0 | 1.0 | 3.0 | 1.0 | 0.0 | 3.0 | 1.0 | 2.0 | 1.0 | 90.0 |
lat = characteristics['lat']
lon = characteristics['long']
dep = characteristics['dep']
catr = places['catr'].map({1 : 'Highway',
2 : 'National road',
3 : 'Departmental road',
4 : 'Communal roads',
5 : 'Outside the public network',
6 : 'Parking lot open to public traffic',
7 : 'Urban metropolis roads',
9 : 'other'})
lum = characteristics['lum'].map({1 : 'Full day',
2 : 'Twilight or dawn',
3 : 'Night without public lighting',
4 : 'Night with public lighting not on',
5 : 'Night with public lighting on'})
atm = characteristics['atm'].map({1 : 'Normal',
2 : 'Light rain',
3 : 'Heavy rain',
4 : 'Snow. hail',
5 : 'Fog. smoke',
6 : 'Strong wind. storm',
7 : 'Dazzling weather',
8 : 'Cloudy weather',
9 : 'Other'})
col = characteristics['col'].map({1 : 'Two vehicles. frontal',
2 : 'Two vehicles. from the rear',
3 : 'Two vehicles. from the side',
4 : 'Three vehicles and more. in a chain',
5 : 'Three or more vehicles. multiple collisions',
6 : 'Other collision 7. No collision'})
circ = places['circ'].map({1 : 'One way',
2 : 'Bidirectional',
3 : 'A separate carriageway',
4 : 'With variable assignment channels'})
prof = places['prof'].map({1 : 'Flat',
2 : 'Slope',
3 : 'hilltop',
4 : 'Bottom of coast'})
plan = places['plan'].map({1 : 'rectilinear part',
2 : 'In a curve to the left',
3 : 'In a curve to the right',
4 : 'In "S"'})
surf = places['surf'].map({1 : 'Normal',
2 : 'Wet',
3 : 'Puddles',
4 : 'Flooded',
5 : 'Snowy',
6 : 'Mud',
7 : 'Icy',
8 : 'Fat. oil',
9 : 'Other'})
vma = places['vma']
import plotly.express as px
fig = px.scatter_mapbox(characteristics,
lat="lat",
lon="long",
hover_name=catr,
hover_data={'Light':lum,
'Atmosphere':atm,
'Collision':col,
'Regime':circ,
'Profile':prof,
'Layout':plan,
'Surface':surf,
'Speed':vma,
'long':False,
'lat':False},
zoom=4.9,
height=800,
width=800)
fig.data[0]['marker'].update(color='red') #green
fig.data[0]['marker'].update(size=3)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()